import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
# Load the combined cattle / crop / weather table; the first CSV column is
# used as the row index.
dta = pd.read_csv("cattle_corn_soybean_weather.csv", index_col=0)
dta.head()


def _parse_number(value):
    """Convert a thousands-separated string such as "1,234" to a float.

    Non-string values (already numeric, or missing) are returned unchanged so
    that a blank cell does not raise on ``.replace``.
    """
    if isinstance(value, str):
        return float(value.replace(",", ""))
    return value


# Give the production columns short names and numeric values.
dta['Cattle'] = dta['Cattle production in lb'].apply(_parse_number)
dta['Corn'] = dta['Corn production in tons'].apply(_parse_number)
dta['Soybean'] = dta['Soybeans production in bu']
dta.drop(
    ['Cattle production in lb', 'Corn production in tons', 'Soybeans production in bu'],
    axis=1,
    inplace=True,
)
dta.head()
# Plot cattle production against every weather / crop variable, fitting one
# regression line per state.  Each variable is plotted once (the original
# copy-pasted cells repeated AWND, SNOW and PRCP).
cattle_predictors = [
    "AWND", "SNOW", "PRCP", "DT32", "DX90", "Corn", "Soybean",
    "TMAX", "TMIN", "TAVG", "EMXT", "EMNT",
]
for predictor in cattle_predictors:
    g = sns.lmplot(x=predictor, y="Cattle", hue="STATE",
                   truncate=True, height=15, data=dta)
    g.set_axis_labels(predictor, "Cattle")

# Trend of cattle production over time, again one line per state; this plot
# carries the full unit in its y-axis label.
g = sns.lmplot(x="Year", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)
g.set_axis_labels("Year", "Cattle production in lb")
dta.STATE.unique()
# One small Year-vs-Cattle regression plot per state; the state name is used
# as the y-axis label so the figures can be told apart.
for name in dta.STATE.unique():
    state_rows = dta[dta['STATE'] == name]
    g = sns.lmplot(x="Year", y="Cattle",
                   truncate=True, height=5, data=state_rows)
    g.set_axis_labels("Year", name)
# For each state, cattle production fluctuates differently over the years.
# Pairwise scatter matrix of every column, coloured by state.
sns.pairplot(dta.dropna(), hue="STATE")

# Weather-only view (STATE kept alongside for reference).
weather_subset = dta[['TMAX', 'TMIN', 'TAVG', 'EMNT', 'DX90', 'DT32',
                      'PRCP', 'SNOW', 'AWND', "STATE"]]
weather_subset.head()
sns.pairplot(weather_subset.dropna())

# Correlate numeric columns only: newer pandas raises when corr() meets a
# non-numeric column (STATE is presumably text labels), whereas older pandas
# dropped such columns silently.  Selecting by dtype works on both.
corr = weather_subset.select_dtypes(include=[np.number]).corr()

# Start a fresh figure so the heatmap is not drawn onto the pair-plot axes.
plt.figure()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="coolwarm",
            annot=True)
# DT32 and SNOW have a strong negative correlation with TAVG.
# PRCP and AWND also correlate with the temperature parameters, but to a lesser degree.
# Per-state Pearson correlation between cattle production and every other
# column (Year excluded).  Rows are states, columns are variables.
states = dta.STATE.unique()
dta_noyear = dta.drop(['Year'], axis=1)
# Cattle's correlation with itself is not informative, so it is left out.
correlated_columns = [
    colname for colname in dta_noyear if colname not in ('STATE', 'Cattle')
]

correlation_rows = []
for state in states:
    # Filter once per state instead of twice per (state, column) cell.
    state_rows = dta_noyear[dta_noyear['STATE'] == state]
    correlation_rows.append(
        [state_rows['Cattle'].corr(state_rows[colname])
         for colname in correlated_columns]
    )
dta_cor = pd.DataFrame(correlation_rows,
                       index=pd.Index(states, name='state'),
                       columns=correlated_columns)

sns.set()
# Draw a heatmap with the correlation value annotated in each cell.
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(dta_cor, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
# Min-max scale every numeric column to [0, 1] across the whole data set
# (all states pooled together).
dta_scaled = dta.drop(['STATE', 'Year'], axis=1)
dta_scaled.head()
x = dta_scaled.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Rebuild the frame on dta's own index: the STATE / Year assignments below
# align by index label, so a default RangeIndex would mismatch whenever the
# CSV index is not exactly 0..n-1.
dta_scaled = pd.DataFrame(x_scaled,
                          columns=dta_scaled.columns,
                          index=dta.index)
dta_scaled.head()
dta_scaled['STATE'] = dta['STATE']
dta_scaled['Year'] = dta['Year']
dta_scaled.head()

g = sns.lmplot(x="AWND", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta_scaled)
g.set_axis_labels("AWND", "Cattle")
# Min-max scale every numeric column to [0, 1] separately within each state,
# so states of very different size become comparable.
dta_scalebystate = dta.copy(deep=True)
dta_scalebystate.head()
for colname in list(dta_scalebystate.drop(['STATE', 'Year'], axis=1)):
    # transform() returns a result aligned to the original rows on every
    # pandas version; Series.min()/max() skip NaN, unlike the builtins whose
    # result depends on where a NaN sits in the group.
    dta_scalebystate[colname] = dta_scalebystate.groupby('STATE')[colname].transform(
        lambda values: (values - values.min()) / (values.max() - values.min())
    )
dta_scalebystate.head()
dta.head()
# Pooled regression (no per-state hue) of state-scaled cattle production
# against each state-scaled predictor, one figure per predictor.
for scaled_predictor in ("Soybean", "Corn", "AWND", "SNOW", "PRCP", "DT32",
                         "DX90", "EMXT", "EMNT", "TMAX", "TMIN", "TAVG"):
    g = sns.lmplot(x=scaled_predictor, y="Cattle",
                   truncate=True, height=15, data=dta_scalebystate)
    g.set_axis_labels(scaled_predictor, "Cattle")
# Scatter the state-scaled cattle production and average temperature against
# Year, colouring the points by state; each variable gets its own figure.
for scaled_target in ("Cattle", "TAVG"):
    sns.set(style="whitegrid")
    f, ax = plt.subplots(figsize=(15, 15))
    sns.despine(f, left=True, bottom=True)
    sns.scatterplot(x="Year", y=scaled_target,
                    hue="STATE",
                    linewidth=0,
                    data=dta_scalebystate, ax=ax)
# Pooled linear trend over time for several state-scaled variables.
for trend_variable in ("Cattle", "TMAX", "TMIN", "SNOW"):
    g = sns.lmplot(x="Year", y=trend_variable,
                   truncate=True, height=15, data=dta_scalebystate)
    g.set_axis_labels("Year", trend_variable)
# Inspect the distribution of the target (raw and under several transforms)
# and of a few predictors.
distributions_to_plot = [
    dta['Cattle'],
    np.log(np.log(dta['Cattle'])),
    np.log(dta['Cattle']),
    boxcox(dta['Cattle'], 0.5),
    dta_scalebystate['Cattle'],
    np.log(dta_scalebystate['Cattle'] + 1),  # +1 keeps log defined at 0
    dta_scaled['Cattle'],
    np.log(dta_scaled['Cattle'] + 1),
    dta_scalebystate['TAVG'],
    dta['TAVG'],
    dta.dropna()['Corn'],
    dta_scalebystate.dropna()['Corn'],
]
for values in distributions_to_plot:
    # One figure per distribution: the series live on very different scales,
    # so stacking them on shared axes makes the plot unreadable.
    plt.figure()
    sns.distplot(values)
from sklearn.linear_model import LinearRegression
from sklearn import metrics

dta.columns
dta.head()

# Baseline: ordinary least squares on the numeric predictors only
# (STATE and Year are ignored for now).  Rows with any missing value are
# dropped once and reused for both x and y.
complete_rows = dta.dropna()
x = complete_rows[['TMAX', 'TMIN', 'TAVG', 'EMXT', 'EMNT', 'DX90', 'DT32',
                   'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
y = complete_rows['Cattle']
# random_state=None: the 60/40 split differs from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.4, random_state=None)

lm = LinearRegression()
lm.fit(xTrain, yTrain)
print(lm.intercept_)
print(lm.coef_)
cdf = pd.DataFrame(lm.coef_, x.columns, columns=['coef'])
print(cdf)

predictions = lm.predict(xTest)
# Actual vs. predicted, then the residual distribution, each on its own figure.
plt.figure()
plt.scatter(yTest, predictions)
plt.figure()
sns.distplot(yTest - predictions)

# Report the scores explicitly; a bare expression shows nothing when this
# runs as a script.
print(metrics.mean_absolute_error(yTest, predictions))
print(metrics.r2_score(yTest, predictions))
predictions2 = lm.predict(xTrain)
print(metrics.r2_score(yTrain, predictions2))
### predictions
### include regularization
from sklearn import linear_model

# Same features and split as the preceding linear model, with an L1 penalty.
reg = linear_model.Lasso(alpha=1)
reg.fit(xTrain, yTrain)
reg.coef_

predictions = reg.predict(xTrain)
print(metrics.r2_score(yTrain, predictions))
predictions = reg.predict(xTest)
# Print the test score too, so both R^2 values are visible side by side.
print(metrics.r2_score(yTest, predictions))

# Log-log view of actual vs. predicted; non-positive predictions become
# NaN/-inf under np.log and are simply not drawn.
plt.figure()
plt.scatter(np.log(yTest), np.log(predictions))
# Residual distribution on its own figure.
plt.figure()
sns.distplot(yTest - predictions)
dta.columns
dta.head()
# Spread of cattle production within each state.
dta.boxplot(column='Cattle', by='STATE', figsize=(20, 8))

# Reduced feature set used for the per-state models.
dta_selected = dta[['Year', 'STATE', 'TAVG', 'PRCP', 'SNOW', 'AWND',
                    'Corn', 'Soybean', 'Cattle']]
dta_selected.columns

x = dta.dropna()[['TAVG', 'STATE', 'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
y = dta.dropna()['Cattle']
x.shape
x.STATE.unique()

# Rebind instead of dropna(inplace=True): dta_selected was sliced out of dta,
# and mutating such a slice in place triggers SettingWithCopyWarning.
dta_selected = dta_selected.dropna()
dta_selected.shape
dta_selected.STATE.unique()
# Fit an independent linear model for every state and collect the train/test
# R^2 of each into a summary table.
train_r2 = []
test_r2 = []
for state in dta_selected.STATE.unique():
    tmp = dta_selected[dta_selected['STATE'] == state]
    x = tmp[['TAVG', 'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
    y = tmp['Cattle']
    xTrain, xTest, yTrain, yTest = train_test_split(
        x, y, test_size=0.4, random_state=None
    )
    lm = LinearRegression()
    lm.fit(xTrain, yTrain)
    prediction1 = lm.predict(xTrain)
    prediction2 = lm.predict(xTest)
    train_r2.append(metrics.r2_score(yTrain, prediction1))
    test_r2.append(metrics.r2_score(yTest, prediction2))

states = list(dta_selected.STATE.unique())
dt = {'STATE': states, 'train_r2': train_r2, 'test_r2': test_r2}
ddf = pd.DataFrame(dt)
from sklearn.preprocessing import LabelEncoder

# Pooled model that also uses STATE as a predictor.
# Other feature sets tried earlier: the full weather set with TMAX/TMIN and
# no STATE, and the reduced set TAVG/PRCP/SNOW/AWND/Corn/Soybean plus STATE.
complete_rows = dta.dropna()
# .copy() so the STATE column can be overwritten below without writing into
# a slice of dta.
x = complete_rows[['STATE', 'TAVG', 'EMXT', 'EMNT', 'DX90', 'DT32',
                   'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']].copy()
y = complete_rows['Cattle']

# Baseline fit without STATE.  NOTE(review): the original fitted on x with the
# raw STATE column, which LinearRegression rejects if STATE holds text labels
# (presumably it does, given the LabelEncoder step below) -- confirm.
xTrain, xTest, yTrain, yTest = train_test_split(
    x.drop(['STATE'], axis=1), y, test_size=0.4, random_state=None
)
lm = LinearRegression()
lm.fit(xTrain, yTrain)
y_pred = lm.predict(xTest)
print(metrics.r2_score(yTest, y_pred))
y_pred_train = lm.predict(xTrain)
print(metrics.r2_score(yTrain, y_pred_train))

x['STATE'].unique()
state_list = x['STATE'].unique()
state_list

# Replace each state label by an integer code so the model can consume it.
labelencoder = LabelEncoder()
x['STATE'] = labelencoder.fit_transform(x['STATE'])
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.4, random_state=None)
lm = LinearRegression()
lm.fit(xTrain, yTrain)
y_pred = lm.predict(xTest)
print(metrics.r2_score(yTest, y_pred))
y_pred_train = lm.predict(xTrain)
print(metrics.r2_score(yTrain, y_pred_train))

# Actual vs. predicted on log and linear scales, each on its own figure.
plt.figure()
plt.scatter(np.log(yTest), np.log(y_pred))
plt.figure()
plt.scatter(yTest, y_pred)

# Export the fitted coefficients.
cdf = pd.DataFrame(lm.coef_, x.columns, columns=['coef'])
print(cdf)
cdf.to_csv(r'cdf.csv')

# Export the code -> state mapping.  classes_[i] is the label encoded as i,
# so this covers however many states survived dropna() rather than assuming 31.
state_labels = labelencoder.classes_
state_labels
cdf2 = pd.DataFrame(state_labels, range(len(state_labels)), columns=['label'])
cdf2.to_csv(r'cdf2.csv')
from sklearn.tree import DecisionTreeRegressor

# NOTE(review): X_train_one_hot / X_test_one_hot were used here without ever
# being defined in this file.  They are reconstructed as the current
# xTrain / xTest with STATE expanded into indicator columns -- confirm this
# matches the intended features.
X_train_one_hot = pd.get_dummies(xTrain, columns=['STATE'])
# Give the test frame exactly the training columns: a state absent from the
# test split gets an all-zero indicator, one absent from training is dropped.
X_test_one_hot = pd.get_dummies(xTest, columns=['STATE']).reindex(
    columns=X_train_one_hot.columns, fill_value=0
)

clf = DecisionTreeRegressor(random_state=None)
clf.fit(X_train_one_hot, yTrain)
y_pred = clf.predict(X_test_one_hot)
print(metrics.r2_score(yTest, y_pred))
y_pred2 = clf.predict(X_train_one_hot)
print(metrics.r2_score(yTrain, y_pred2))
# sklearn.externals.six was removed from scikit-learn; the standard-library
# StringIO is the drop-in replacement for an in-memory text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Render the fitted decision tree: export it as Graphviz DOT text into an
# in-memory buffer, then convert that text to a PNG image.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Final ad-hoc inspection: first rows of the raw table and of the last
# feature matrix.  The bare range(12) expression has no effect.
dta.head()
range(12)
x.head()